Data description

Air quality data collected at outdoor monitors across the United States, Puerto Rico, and the U.S. Virgin Islands. The data come primarily from the AQS (Air Quality System) database. (From it we chose the ozone and SO2 data.)

Libraries used

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggmap)

Load data

# Load the ozone measurements from the zipped CSV.
# NOTE(review): the column specification printed below shows `Site Num`
# guessed as integer for this file, while head(df2) shows it as character
# for the SO2 file -- confirm whether leading zeros in site ids matter.
df <- read_csv(file = "Ozone.zip")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Site Num` = col_integer(),
##   `Parameter Code` = col_integer(),
##   POC = col_integer(),
##   Latitude = col_double(),
##   Longitude = col_double(),
##   `Date Local` = col_date(format = ""),
##   `Time Local` = col_time(format = ""),
##   `Date GMT` = col_date(format = ""),
##   `Time GMT` = col_time(format = ""),
##   `Sample Measurement` = col_double(),
##   MDL = col_double(),
##   `Date of Last Change` = col_date(format = "")
## )
## See spec(...) for full column specifications.
# Load the SO2 measurements from the zipped CSV.
# NOTE(review): the column types guessed for this file differ from the ozone
# file (`Site Num` is character here and integer there; `Method Code` is
# integer here and character there) -- confirm before combining the tables.
df2 <- read_csv(file = "SO2.zip")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Parameter Code` = col_integer(),
##   POC = col_integer(),
##   Latitude = col_double(),
##   Longitude = col_double(),
##   `Date Local` = col_date(format = ""),
##   `Time Local` = col_time(format = ""),
##   `Date GMT` = col_date(format = ""),
##   `Time GMT` = col_time(format = ""),
##   `Sample Measurement` = col_double(),
##   MDL = col_double(),
##   `Method Code` = col_integer(),
##   `Date of Last Change` = col_date(format = "")
## )
## See spec(...) for full column specifications.
# Preview the first six ozone rows
head(df, n = 6L)
## # A tibble: 6 × 24
##   `State Code` `County Code` `Site Num` `Parameter Code`   POC Latitude
##          <chr>         <chr>      <int>            <int> <int>    <dbl>
## 1           01           049       9991            44201     1   34.289
## 2           01           049       9991            44201     1   34.289
## 3           01           049       9991            44201     1   34.289
## 4           01           049       9991            44201     1   34.289
## 5           01           049       9991            44201     1   34.289
## 6           01           049       9991            44201     1   34.289
## # ... with 18 more variables: Longitude <dbl>, Datum <chr>, `Parameter
## #   Name` <chr>, `Date Local` <date>, `Time Local` <time>, `Date
## #   GMT` <date>, `Time GMT` <time>, `Sample Measurement` <dbl>, `Units of
## #   Measure` <chr>, MDL <dbl>, Uncertainty <chr>, Qualifier <chr>, `Method
## #   Type` <chr>, `Method Code` <chr>, `Method Name` <chr>, `State
## #   Name` <chr>, `County Name` <chr>, `Date of Last Change` <date>
# Preview the first six SO2 rows
head(df2, n = 6L)
## # A tibble: 6 × 24
##   `State Code` `County Code` `Site Num` `Parameter Code`   POC Latitude
##          <chr>         <chr>      <chr>            <int> <int>    <dbl>
## 1           01           073       0023            42401     2 33.55306
## 2           01           073       0023            42401     2 33.55306
## 3           01           073       0023            42401     2 33.55306
## 4           01           073       0023            42401     2 33.55306
## 5           01           073       0023            42401     2 33.55306
## 6           01           073       0023            42401     2 33.55306
## # ... with 18 more variables: Longitude <dbl>, Datum <chr>, `Parameter
## #   Name` <chr>, `Date Local` <date>, `Time Local` <time>, `Date
## #   GMT` <date>, `Time GMT` <time>, `Sample Measurement` <dbl>, `Units of
## #   Measure` <chr>, MDL <dbl>, Uncertainty <chr>, Qualifier <chr>, `Method
## #   Type` <chr>, `Method Code` <int>, `Method Name` <chr>, `State
## #   Name` <chr>, `County Name` <chr>, `Date of Last Change` <date>

Understand the Data

# Overview of the measurement methods: number of ozone records per method
summarise(
  group_by(df, `Method Name`),
  n()
)
## # A tibble: 6 × 2
##                                                     `Method Name`  `n()`
##                                                             <chr>  <int>
## 1        Instrumental - Chemiluminescence API Model 265E and T265   3505
## 2                               Instrumental - Ecotech Serinus 10   2910
## 3                                     INSTRUMENTAL - ULTRA VIOLET 808729
## 4                          INSTRUMENTAL - ULTRA VIOLET ABSORPTION 845830
## 5                   INSTRUMENTAL - ULTRAVIOLET RADIATION ABSORBTN   8189
## 6 Instrumental - UV absorption photometry/UV 2B model 202 and 205   8562
# Check whether the measurements differ between methods by comparing
# the mean ozone measurement recorded with each one.
summarise(
  group_by(df, `Method Name`),
  mean_measure = mean(`Sample Measurement`)
)
## # A tibble: 6 × 2
##                                                     `Method Name`
##                                                             <chr>
## 1        Instrumental - Chemiluminescence API Model 265E and T265
## 2                               Instrumental - Ecotech Serinus 10
## 3                                     INSTRUMENTAL - ULTRA VIOLET
## 4                          INSTRUMENTAL - ULTRA VIOLET ABSORPTION
## 5                   INSTRUMENTAL - ULTRAVIOLET RADIATION ABSORBTN
## 6 Instrumental - UV absorption photometry/UV 2B model 202 and 205
## # ... with 1 more variables: mean_measure <dbl>
# Box plot of the ozone measurement distribution for each method
ggplot(
  data = df,
  mapping = aes(x = `Method Name`, y = `Sample Measurement`)
) +
  geom_boxplot()

The sample measurements collected with the different methods show little difference.

Data analysis

# Add the local hour of day to each table as a plain number (0-23).
# `Time Local` is parsed as a time-of-day column; dividing it by 3600 yields
# hour values but leaves them in a time class whose unit is still seconds,
# which mislabels them downstream. Converting with units = "hours" gives an
# ordinary numeric column instead.
Ozone <- df %>%
  mutate(Time_in_Hour = as.numeric(`Time Local`, units = "hours"))
SO2 <- df2 %>%
  mutate(Time_in_Hour1 = as.numeric(`Time Local`, units = "hours"))

Time vs. Sample Measurement

# Box plots of the ozone measurement for each hour of the day,
# drawn in a separate panel for every measurement method
ggplot(
  data = Ozone,
  mapping = aes(x = factor(Time_in_Hour), y = `Sample Measurement`)
) +
  geom_boxplot() +
  facet_wrap(~`Method Name`)

# Scatter plot of the ozone measurement against the hour of day.
# The layer has to be added to a ggplot() object to be drawn (on its own it
# only prints the layer description), and the aesthetics must use the real
# column names: `Time_in_Hour` and `Sample Measurement`.
# as.numeric() keeps the x axis continuous whatever class the hour column has.
ggplot(data = Ozone) +
  geom_point(
    mapping = aes(x = as.numeric(Time_in_Hour), y = `Sample Measurement`)
  )
## mapping: x = Time_in_hour, y = `Sample measurement` 
## geom_point: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity
# Box plots of the SO2 measurement for each hour of the day,
# drawn in a separate panel for every measurement method
ggplot(
  data = SO2,
  mapping = aes(x = factor(Time_in_Hour1), y = `Sample Measurement`)
) +
  geom_boxplot() +
  facet_wrap(~`Method Name`)

# Scatter plot of the SO2 measurement against the hour of day.
# The layer has to be added to a ggplot() object to be drawn (on its own it
# only prints the layer description), and the aesthetics must use the real
# column names: `Time_in_Hour1` (the SO2 hour column) and `Sample Measurement`.
# as.numeric() keeps the x axis continuous whatever class the hour column has.
ggplot(data = SO2) +
  geom_point(
    mapping = aes(x = as.numeric(Time_in_Hour1), y = `Sample Measurement`)
  )
## mapping: x = Time_in_hour, y = `Sample measurement` 
## geom_point: na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity

The plots show that ozone levels are generally highest during the daytime, roughly from 10:00 to 17:00.

Geographical Distribution of Ozone

# Fetch a Google hybrid base map covering the United States
map <- get_map(
  location = "the United States of America",
  zoom = 4,
  maptype = "hybrid",
  source = "google",
  color = "color"
)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=the+United+States+of+America&zoom=4&size=640x640&scale=2&maptype=hybrid&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=the%20United%20States%20of%20America&sensor=false
# Attach each state's mean ozone measurement to its rows, then keep one row
# per distinct combination of state mean, state, and monitor coordinates.
Df_New <- df %>%
  group_by(`State Name`) %>%
  mutate(mean_measure = mean(`Sample Measurement`)) %>%
  select(mean_measure, `State Name`, Longitude, Latitude) %>%
  distinct()

# Draw the monitor locations on the base map, coloured by the state mean
ggmap(map) +
  geom_point(
    data = Df_New,
    mapping = aes(x = Longitude, y = Latitude, colour = mean_measure),
    size = 3,
    alpha = 0.5
  )

# Attach each state's mean SO2 measurement to its rows, then keep one row
# per distinct combination of state mean, state, and monitor coordinates.
Df2_New <- df2 %>%
  group_by(`State Name`) %>%
  mutate(mean_measure = mean(`Sample Measurement`)) %>%
  select(mean_measure, `State Name`, Longitude, Latitude) %>%
  distinct()

# Draw the monitor locations on the base map, coloured by the state mean
ggmap(map) +
  geom_point(
    data = Df2_New,
    mapping = aes(x = Longitude, y = Latitude, colour = mean_measure),
    size = 3,
    alpha = 0.5
  )

The maps show that the east and west coasts of the United States usually have higher ozone levels. Areas covered with vegetation also appear to have more concentrated and more widespread ozone, especially around the lakes and along the coasts. The SO2 map shows that high SO2 concentrations are found mainly in the northeastern and southwestern parts of the United States.

SO2 vs. Ozone

# Compare SO2 with ozone on paired observations.
# The two tables come from different monitors, so their raw rows cannot be
# matched one-to-one; joining on the measurement value itself (what merge()
# does when `Sample Measurement` is the only shared column) is meaningless.
# Instead, average each pollutant per state and local date and join the two
# summaries on those keys, keeping only state/date pairs present in both.
OzoneAmount <- df %>%
  group_by(`State Name`, `Date Local`) %>%
  summarise(Ozone = mean(`Sample Measurement`, na.rm = TRUE)) %>%
  ungroup()
SO2Amount <- df2 %>%
  group_by(`State Name`, `Date Local`) %>%
  summarise(SO2 = mean(`Sample Measurement`, na.rm = TRUE)) %>%
  ungroup()
df3 <- inner_join(OzoneAmount, SO2Amount, by = c("State Name", "Date Local"))

# The `+` is required to add the point layer to the plot, and the aesthetics
# must name columns of the plotted data frame rather than whole data frames.
ggplot(df3) +
  geom_point(mapping = aes(x = Ozone, y = SO2))

Conclusion